-
Notifications
You must be signed in to change notification settings - Fork 2.7k
Conversation
benaadams
commented
Dec 11, 2017
•
edited
Loading
edited
PTAL @jkotas |
Could you please also do a sanity check of how the change affects throughput, e.g. run |
Added end of function to asm also (post loop) |
Hmm... will work on it a bit more... https://gist.github.com/benaadams/3e2ba5bbc0f265f7ac2dbd5a7f2b1b67 Before
After
|
Just trying to work on when the loop isn't run (simple versions) |
d1f4d47
to
d5feaf9
Compare
The last set of changes (d5feaf9) is pretty subtle, but I think I got there?
Before
After
|
Before asm ; V00 this [V00,T03] ( 27, 43.50) ref -> rsi this class-hnd
; V01 arg1 [V01,T14] ( 5, 5.50) ref -> rdi class-hnd
; V02 loc0 [V02,T09] ( 10, 15.50) int -> rbx
; V03 loc1 [V03,T02] ( 24, 62 ) int -> r15
; V04 tmp0 [V04,T19] ( 5, 4.50) long -> rcx
; V05 tmp1 [V05,T20] ( 3, 3 ) ref -> rbx class-hnd
; V06 tmp2 [V06,T18] ( 6, 5.50) long -> r11
;* V07 tmp3 [V07 ] ( 0, 0 ) long -> zero-ref
; V08 tmp4 [V08,T12] ( 3, 10 ) long -> rcx
; V09 tmp5 [V09,T11] ( 3, 12 ) ref -> r13 class-hnd
; V10 tmp6 [V10,T08] ( 4, 16 ) ref -> [rsp+0x28] class-hnd
; V11 tmp7 [V11,T06] ( 6, 22 ) long -> r11
;* V12 tmp8 [V12 ] ( 0, 0 ) long -> zero-ref
; V13 tmp9 [V13,T10] ( 13, 13 ) ref -> rcx
; V14 tmp10 [V14,T07] ( 18, 18 ) int -> rdx
; V15 tmp11 [V15,T00] ( 9, 72 ) ref -> rcx
; V16 tmp12 [V16,T05] ( 9, 36 ) ref -> rcx
; V17 tmp13 [V17,T01] ( 9, 72 ) ref -> rax
; V18 OutArgs [V18 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
; V19 cse0 [V19,T15] ( 6, 6 ) long -> r14
; V20 cse1 [V20,T04] ( 12, 42 ) long -> r12
; V21 cse2 [V21,T13] ( 17, 8.50) int -> r8
; V22 cse3 [V22,T17] ( 11, 5.50) ref -> rax
; V23 cse4 [V23,T16] ( 6, 6 ) long -> rbp
;
; Lcl frame size = 56
G_M16822_IG01:
push r15
push r14
push r13
push r12
push rdi
push rsi
push rbp
push rbx
sub rsp, 56
mov qword ptr [rsp+30H], rcx
mov rsi, rcx
mov rdi, rdx
G_M16822_IG02:
test rdi, rdi
je G_M16822_IG13
G_M16822_IG03:
cmp gword ptr [rsi+8], 0
je G_M16822_IG08
mov rbx, gword ptr [rsi+24]
mov rbp, qword ptr [rsi]
mov rcx, rbp
mov rdx, qword ptr [rcx+48]
mov r14, qword ptr [rdx]
mov r11, qword ptr [r14+136]
test r11, r11
jne SHORT G_M16822_IG04
lea rdx, [(reloc)]
call CORINFO_HELP_RUNTIMEHANDLE_CLASS
mov r11, rax
G_M16822_IG04:
mov rcx, rbx
mov rdx, rdi
cmp dword ptr [rcx], ecx
call qword ptr [r11]
mov ebx, eax
and ebx, 0xD1FFAB1E
mov rax, gword ptr [rsi+8]
mov rcx, rax
mov r8d, dword ptr [rax+8]
mov eax, ebx
cdq
idiv edx:eax, r8d
cmp edx, r8d
jae G_M16822_IG14 ; CORINFO_HELP_RNGCHKFAIL
movsxd rdx, edx
mov r15d, dword ptr [rcx+4*rdx+16]
test r15d, r15d
jl G_M16822_IG08
G_M16822_IG05: ; Loop start
mov rcx, gword ptr [rsi+16]
cmp r15d, dword ptr [rcx+8]
jae G_M16822_IG14 ; CORINFO_HELP_RNGCHKFAIL
movsxd rdx, r15d
lea r12, [rdx+2*rdx]
cmp dword ptr [rcx+8*r12+32], ebx
jne SHORT G_M16822_IG07
mov r13, gword ptr [rsi+24]
mov rcx, gword ptr [rsi+16]
cmp r15d, dword ptr [rcx+8]
jae G_M16822_IG14 ; CORINFO_HELP_RNGCHKFAIL
mov rax, gword ptr [rcx+8*r12+16]
mov gword ptr [rsp+28H], rax
mov rcx, rbp
mov r11, qword ptr [r14+144]
test r11, r11
jne SHORT G_M16822_IG10
lea rdx, [(reloc)]
call CORINFO_HELP_RUNTIMEHANDLE_CLASS
mov r11, rax
mov rax, gword ptr [rsp+28H]
G_M16822_IG06:
mov rcx, r13
mov rdx, rax
mov r8, rdi
cmp dword ptr [rcx], ecx
call qword ptr [r11]
test eax, eax
jne SHORT G_M16822_IG11
G_M16822_IG07:
mov rax, gword ptr [rsi+16]
cmp r15d, dword ptr [rax+8]
jae G_M16822_IG14 ; CORINFO_HELP_RNGCHKFAIL
mov r15d, dword ptr [rax+8*r12+36]
test r15d, r15d
jge G_M16822_IG05 ; Loop end
G_M16822_IG08:
mov eax, -1
G_M16822_IG09:
add rsp, 56
pop rbx
pop rbp
pop rsi
pop rdi
pop r12
pop r13
pop r14
pop r15
ret
G_M16822_IG10:
mov rax, gword ptr [rsp+28H]
jmp SHORT G_M16822_IG06
G_M16822_IG11:
mov eax, r15d
G_M16822_IG12:
add rsp, 56
pop rbx
pop rbp
pop rsi
pop rdi
pop r12
pop r13
pop r14
pop r15
ret
************** Beginning of cold code **************
G_M16822_IG13:
mov ecx, 4
call ThrowHelper:ThrowArgumentNullException(int)
int3
G_M16822_IG14:
call CORINFO_HELP_RNGCHKFAIL
int3
; Total bytes of code 356, prolog size 27 for method Dictionary`2:FindEntry(ref):int:this After asm ; V00 this [V00,T10] ( 9, 6 ) ref -> rsi this class-hnd
; V01 arg1 [V01,T11] ( 5, 5.50) ref -> rdi class-hnd
; V02 loc0 [V02,T14] ( 9, 5.50) ref -> rbx class-hnd
; V03 loc1 [V03,T00] ( 19, 52 ) int -> rbp
; V04 loc2 [V04,T18] ( 4, 3.50) ref -> r14 class-hnd
; V05 loc3 [V05,T06] ( 8, 18 ) int -> r13
; V06 loc4 [V06,T01] ( 15, 43 ) ref -> rbx class-hnd
; V07 tmp0 [V07,T16] ( 5, 4.50) long -> rcx
; V08 tmp1 [V08,T15] ( 6, 5.50) long -> r11
;* V09 tmp2 [V09 ] ( 0, 0 ) long -> zero-ref
; V10 tmp3 [V10,T08] ( 3, 10 ) long -> rcx
; V11 tmp4 [V11,T05] ( 5, 20 ) ref -> [rsp+0x28] class-hnd
; V12 tmp5 [V12,T04] ( 6, 22 ) long -> r11
;* V13 tmp6 [V13 ] ( 0, 0 ) long -> zero-ref
; V14 tmp7 [V14,T09] ( 9, 9 ) int -> rdx
; V15 OutArgs [V15 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
; V16 cse0 [V16,T02] ( 10, 40 ) byref -> [rsp+0x20]
; V17 cse1 [V17,T12] ( 6, 6 ) long -> r12
; V18 cse2 [V18,T03] ( 10, 32 ) long -> rcx
; V19 cse3 [V19,T07] ( 5, 18 ) int -> rcx
; V20 cse4 [V20,T13] ( 6, 6 ) long -> r15
; V21 cse5 [V21,T17] ( 8, 4 ) int -> rcx
;
; Lcl frame size = 56
G_M16822_IG01:
push r15
push r14
push r13
push r12
push rdi
push rsi
push rbp
push rbx
sub rsp, 56
mov qword ptr [rsp+30H], rcx
mov rsi, rcx
mov rdi, rdx
G_M16822_IG02:
test rdi, rdi
je G_M16822_IG10
G_M16822_IG03:
mov rbx, gword ptr [rsi+8]
mov ebp, -1
test rbx, rbx
je G_M16822_IG08
mov r14, gword ptr [rsi+24]
mov r15, qword ptr [rsi]
mov rcx, r15
mov rdx, qword ptr [rcx+48]
mov r12, qword ptr [rdx]
mov r11, qword ptr [r12+136]
test r11, r11
jne SHORT G_M16822_IG04
lea rdx, [(reloc)]
call CORINFO_HELP_RUNTIMEHANDLE_CLASS
mov r11, rax
G_M16822_IG04:
mov rcx, r14
mov rdx, rdi
cmp dword ptr [rcx], ecx
call qword ptr [r11]
mov r13d, eax
and r13d, 0xD1FFAB1E
mov ecx, dword ptr [rbx+8]
mov eax, r13d
cdq
idiv edx:eax, ecx
cmp edx, ecx
jae G_M16822_IG11 ; CORINFO_HELP_RNGCHKFAIL
movsxd rcx, edx
mov ebp, dword ptr [rbx+4*rcx+16]
mov rbx, gword ptr [rsi+16]
test ebp, ebp
jl SHORT G_M16822_IG08
G_M16822_IG05: ; Loop start
mov ecx, dword ptr [rbx+8]
cmp ebp, ecx
jae G_M16822_IG11 ; CORINFO_HELP_RNGCHKFAIL
movsxd rcx, ebp
lea rcx, [rcx+2*rcx]
lea rax, bword ptr [rbx+8*rcx+16]
mov bword ptr [rsp+20H], rax
cmp dword ptr [rax+16], r13d
jne SHORT G_M16822_IG07
mov r8, gword ptr [rbx+8*rcx+16]
mov gword ptr [rsp+28H], r8
mov rcx, r15
mov r11, qword ptr [r12+144]
test r11, r11
jne SHORT G_M16822_IG06
lea rdx, [(reloc)]
call CORINFO_HELP_RUNTIMEHANDLE_CLASS
mov r11, rax
G_M16822_IG06:
mov rcx, r14
mov rdx, rdi
mov r8, gword ptr [rsp+28H]
cmp dword ptr [rcx], ecx
call qword ptr [r11]
test eax, eax
jne SHORT G_M16822_IG08
G_M16822_IG07:
mov rax, bword ptr [rsp+20H]
mov ebp, dword ptr [rax+20]
test ebp, ebp
jge SHORT G_M16822_IG05 ; Loop end
G_M16822_IG08:
mov eax, ebp
G_M16822_IG09:
add rsp, 56
pop rbx
pop rbp
pop rsi
pop rdi
pop r12
pop r13
pop r14
pop r15
ret
************** Beginning of cold code **************
G_M16822_IG10:
mov ecx, 4
call ThrowHelper:ThrowArgumentNullException(int)
int3
G_M16822_IG11:
call CORINFO_HELP_RNGCHKFAIL
int3
; Total bytes of code 295, prolog size 27 for method Dictionary`2:FindEntry(ref):int:this |
{ | ||
if (_entries[i].hashCode == hashCode && _comparer.Equals(_entries[i].key, key)) return i; | ||
if (entries[i].hashCode == hashCode && comparer.Equals(key, entries[i].key)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is likely that there is (fragile) code out there that depends on the exact order of arguments passed into the comparer. Switching the order is going to break it. Could you please preserve the argument order?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Switched, though it increases code size. However, have found a pattern to drop the range check which makes up for it
|
@mikedn moving do // or while(true)
{
if ((uint)i >= (uint)entries.Length || (entries[i] |
G_M16822_IG01:
push r15
push r14
push r13
push r12
push rdi
push rsi
push rbp
push rbx
sub rsp, 56
mov qword ptr [rsp+30H], rcx
mov rsi, rcx
mov rdi, rdx
G_M16822_IG02:
test rdi, rdi
je G_M16822_IG11
G_M16822_IG03:
mov rbx, gword ptr [rsi+8]
mov ebp, -1
test rbx, rbx
je G_M16822_IG09
mov r14, gword ptr [rsi+24]
mov r15, qword ptr [rsi]
mov rcx, r15
mov rdx, qword ptr [rcx+48]
mov r12, qword ptr [rdx]
mov r11, qword ptr [r12+136]
test r11, r11
jne SHORT G_M16822_IG04
lea rdx, [(reloc)]
call CORINFO_HELP_RUNTIMEHANDLE_CLASS
mov r11, rax
G_M16822_IG04:
mov rcx, r14
mov rdx, rdi
cmp dword ptr [rcx], ecx
call qword ptr [r11]
mov r13d, eax
and r13d, 0xD1FFAB1E
mov ecx, dword ptr [rbx+8]
mov eax, r13d
cdq
idiv edx:eax, ecx
cmp edx, ecx
jae G_M16822_IG12 ; CORINFO_HELP_RNGCHKFAIL
movsxd rcx, edx
mov ebp, dword ptr [rbx+4*rcx+16]
mov rbx, gword ptr [rsi+16]
G_M16822_IG05: ; Loop start
mov ecx, dword ptr [rbx+8]
cmp ecx, ebp
jbe SHORT G_M16822_IG09
movsxd rcx, ebp
lea rcx, [rcx+2*rcx]
lea rax, bword ptr [rbx+8*rcx+16]
mov bword ptr [rsp+20H], rax
cmp dword ptr [rax+16], r13d
jne SHORT G_M16822_IG07
mov r8, gword ptr [rbx+8*rcx+16]
mov gword ptr [rsp+28H], r8
mov rcx, r15
mov r11, qword ptr [r12+144]
test r11, r11
jne SHORT G_M16822_IG08
lea rdx, [(reloc)]
call CORINFO_HELP_RUNTIMEHANDLE_CLASS
mov r11, rax
mov r8, gword ptr [rsp+28H]
G_M16822_IG06:
mov rcx, r14
mov rdx, r8
mov r8, rdi
cmp dword ptr [rcx], ecx
call qword ptr [r11]
test eax, eax
jne SHORT G_M16822_IG09
G_M16822_IG07:
mov rax, bword ptr [rsp+20H]
mov ebp, dword ptr [rax+20]
jmp SHORT G_M16822_IG05 ; Loop end
G_M16822_IG08:
mov r8, gword ptr [rsp+28H]
jmp SHORT G_M16822_IG06
G_M16822_IG09:
mov eax, ebp
G_M16822_IG10:
add rsp, 56
pop rbx
pop rbp
pop rsi
pop rdi
pop r12
pop r13
pop r14
pop r15
ret
************** Beginning of cold code **************
G_M16822_IG11:
mov ecx, 4
call ThrowHelper:ThrowArgumentNullException(int)
int3
G_M16822_IG12:
call CORINFO_HELP_RNGCHKFAIL
int3
; Total bytes of code 295, prolog size 27 for method Dictionary`2:FindEntry(ref):int:this |
0c69672
to
b73da8e
Compare
shorter G_M6157_IG01:
push r15
push r14
push r12
push rdi
push rsi
push rbp
push rbx
sub rsp, 32
mov rdi, rcx
mov esi, edx
G_M6157_IG02:
mov rbx, gword ptr [rdi+8]
mov ebp, -1
test rbx, rbx
je SHORT G_M6157_IG05
mov r14, gword ptr [rdi+24]
mov rcx, r14
mov edx, esi
lea r11, [(reloc)]
cmp dword ptr [rcx], ecx
call qword ptr [r11]IEqualityComparer`1:GetHashCode(int):int:this
mov r15d, eax
and r15d, 0xD1FFAB1E
mov ecx, dword ptr [rbx+8]
mov eax, r15d
cdq
idiv edx:eax, ecx
cmp edx, ecx
jae SHORT G_M6157_IG07 ; CORINFO_HELP_RNGCHKFAIL
movsxd rdx, edx
mov ebp, dword ptr [rbx+4*rdx+16]
mov rdi, gword ptr [rdi+16]
mov ebx, dword ptr [rdi+8]
G_M6157_IG03: ; Loop start
cmp ebx, ebp
jbe SHORT G_M6157_IG05
movsxd rdx, ebp
mov r12, rdx
shl r12, 4
cmp dword ptr [rdi+r12+16], r15d
jne SHORT G_M6157_IG04
mov edx, dword ptr [rdi+r12+24]
mov rcx, r14
mov r8d, esi
lea r11, [(reloc)]
cmp dword ptr [rcx], ecx
call qword ptr [r11]IEqualityComparer`1:Equals(int,int):bool:this
test eax, eax
jne SHORT G_M6157_IG05
G_M6157_IG04:
mov ebp, dword ptr [rdi+r12+20]
jmp SHORT G_M6157_IG03 ; Loop end
G_M6157_IG05:
mov eax, ebp
G_M6157_IG06:
add rsp, 32
pop rbx
pop rbp
pop rsi
pop rdi
pop r12
pop r14
pop r15
ret
G_M6157_IG07:
call CORINFO_HELP_RNGCHKFAIL
int3
; Total bytes of code 169, prolog size 19 for method Dictionary`2:FindEntry(int):int:this |
Pre
Post
|
While here, I thought I'd just check what the lower bound is for using with also having mod Length recognized as bounds safe, by switching i = buckets[hashCode % buckets.Length]; for i = Unsafe.Add(ref Unsafe.As<byte, int>(ref buckets.GetRawSzArrayData()), hashCode % buckets.Length);
|
The jit ought to be able to recognize on its own that an index of the form Can you open an issue for this? |
Hmm, do I understand correctly from your benchmark results that it's 2x faster if you get rid of the range check?!!
We discussed this in the past. Assertion propagation could do this if |
No, most of the speed was from using Just wanted to know if it was worth pursuing the gains |
0-length arrays, might be a problem? Raised issue https://github.com/dotnet/coreclr/issues/15472 I could add to [MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int GetValueAtIndexModLength(this int[] array, int index)
{
// Should be: https://github.com/dotnet/coreclr/issues/15472
// return buckets[index % array.Length];
// however the Jit doesn't recognise "mod Length" as bounds safe, so introduces a range check
return Unsafe.Add(ref Unsafe.As<byte, int>(ref array.GetRawSzArrayData()), index % array.Length);
} Seems to work asm wise |
You really should measure the effect of eliminating that range check. I doubt that it's significant enough to warrant such a hack. After all, we have a division operation that requires 26 cycles, and a compare and never-taken branch that take 2 cycles and are possibly executed in parallel with other instructions. |
Takes it back under the before change for very simple ContainsKey #15460 (comment) Equally switching the params from I don't think 0.5ns is hugely significant, but it irks me that it's not an improvement across the board. Obviously the interface and idiv are far bigger items 😄 |
@dotnet-bot test Windows_NT x64 Checked corefx_baseline |
Contract failure prevents the test run
|
@dotnet-bot test Windows_NT x64 Checked corefx_baseline |
b73da8e
to
2ea4ec2
Compare
2ea4ec2
to
25dd8d9
Compare
@dotnet-bot test Windows_NT x64 Checked corefx_baseline |
Failure is independent of this change https://github.com/dotnet/coreclr/issues/15537
|
Adding devirtualization (with a switch on
|
Incorporated into #15419 as a commit |